require(tidyverse)

# cleanProjidsInDataTable
#
# = input =
# datatable: two-dimensional table, such as imported by tidyverse::read_csv
# projid_columns: vector of column names, each of which should appear exactly once in datatable
#
# = output =
# a copy of the datatable where the projid_columns have each been cleaned
# into a vector of character strings exactly eight digits
cleanProjidsInDataTable = function(datatable, projid_columns) {
  copytable = data.table::copy(datatable)
  for (column in projid_columns) {
    selection = select(datatable, all_of(column))
    if (length(selection) == 1) {
      copytable[[column]] = cleanProjidsInVector(selection[[column]])
    } else {
      stop(paste("Unique '", column, "' column not found in datatable.", sep="", collapse=NULL))
    }  
  }
  return(copytable)
}

cleanProjidsInVector = function(values) {
  if (is.integer(values)) {
    return(sapply(values, cleanIntegerProjid))
  } else if (is.double(values)) {
    return(sapply(values, cleanDoubleProjid))
  } else if (is.character(values)) {
    return(sapply(values, cleanStringProjid))
  } else {
    stop("Invalid vector type for cleaning projids.")
  }
}

cleanIntegerProjid = function(value) {
  if (is.integer(value)) {
    if (is.na(value)) {
      return(NA)
    } else if (value >= 0 && value <= 99999999) {
      string = as.character(value)
      zeroes = createLeadingZeroes(as.integer(8)-nchar(string))
      return(paste(zeroes, string, sep="", collapse=""))
    } else {
      stopCleaningProjid(value)
    }  
  } else {
    stop("Attempted to clean non-integer value as integer projid.")
  }
}

createLeadingZeroes = function(count) {
  if (is.integer(count) && count >= 0) {
    return(paste(rep("0", count), collapse=""))
  } else {
    stop(paste("Invalid count of leading zeroes: ", count, sep="", collapse=NULL))
  }
}

cleanDoubleProjid = function(value) {
  if (is.double(value)) {
    if (is.na(value)) {
      return(NA)
    } else if (value == as.integer(value)) {
      return(cleanIntegerProjid(as.integer(value)))
    } else {
      stopCleaningProjid(value)
    }  
  } else {
    stop("Attempted to clean non-double value as double projid.")
  }
}

cleanStringProjid = function(value) {
  if (is.character(value)) {
    if (is.na(value)) {
      return(NA)
    } else if (str_detect(value, "^\\d{1,8}$")) {
      return(cleanIntegerProjid(as.integer(value)))
    } else {
      stopCleaningProjid(value)
    }
  } else {
    stop("Attempted to clean non-character value as character projid.")
  }
}

stopCleaningProjid = function(projid) {
  stop(paste("Invalid projid value for cleaning: ", value, sep="", collapse=NULL))
}
resource_dir = "/pastel/resources/20220203_snRNAseq_AMPAD/"
annotation = readRDS(paste0(resource_dir,"updated_annotations/annotations.rds")) # 1638882 rows
annotation = cleanProjidsInDataTable(annotation,"projid")

load(paste0(resource_dir, "updated_annotations/celltype_exp.RData"))
annotation.filt = annotation[annotation$projid %in% pheno.filt$projid,]
# pheno.filt

Number of donors:

length(unique(pheno.filt$projid))
## [1] 424
length(unique(annotation.filt$projid))
## [1] 424

Final diagnosis

Final consensus cognitive diagnosis: cogdx. Clinical consensus diagnosis of cognitive status at time of death.

counts_cogdx = as.data.frame(table(pheno.filt$cogdx))
colnames(counts_cogdx) = c("cogdx", "frequency")
counts_cogdx$cogdx_description = c("1 NCI: No cognitive impairment",
                                   "2 MCI: Mild cognitive impairment (One impaired domain) and NO other cause of CI",
                                   "3 MCI: Mild cognitive impairment (One impaired domain) AND another cause of CI",
                                   "4 AD: Alzheimer’s dementia and NO other cause of CI",
                                   "5 AD: Alzheimer’s dementia AND another cause of CI",
                                   "6 Other dementia: Other primary cause of dementia")

createDT(counts_cogdx)

Cell types:

Class

createDT(as.tibble(unique(annotation.filt[, c("cell.type", "class")])))
## Warning: `as.tibble()` was deprecated in tibble 2.0.0.
## Please use `as_tibble()` instead.
## The signature and semantics have changed, see `?as_tibble`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.

Cells by donor

What is the total number of cells by donor?

length(unique(annotation.filt$projid))
## [1] 424
cell_perc <- annotation.filt %>% group_by(projid, cell.type) %>% summarise(n_cells = n()) %>% ungroup() %>% group_by(projid) %>% mutate(perc_cells = 100*(n_cells/sum(n_cells)))

cell_perc_wide <- cell_perc %>% select(projid,cell.type,perc_cells) %>% pivot_wider(values_from = perc_cells, names_from = cell.type) # cell proportions by donor 
write.table(cell_perc_wide, file = paste0(resource_dir, "updated_annotations/cellprop_snRNAseq.txt"), quote = F, row.names = F, sep = "\t")

cellsBydonor = annotation.filt %>% group_by(projid) %>% summarise(n_cells = n()) %>% arrange(n_cells)
createDT(cellsBydonor)
cell_perc$projid = as.character(cell_perc$projid)
cell_perc_diag = cell_perc %>% left_join(pheno.filt[, c("projid", "cogdx")] )
cell_perc_diag$cogdx_group = as.character(cell_perc_diag$cogdx)
cell_perc_diag$cogdx_group[cell_perc_diag$cogdx_group %in% c("1")] <- "NCI" # Normal cognition
cell_perc_diag$cogdx_group[cell_perc_diag$cogdx_group %in% c("2", "3")] <- "MCI"
cell_perc_diag$cogdx_group[cell_perc_diag$cogdx_group %in% c("4", "5", "6")] <- "AD"

cell_color_map = list(`Excitatory Neurons` = ggsci::pal_d3("category10")(10)[1],
                      `Inhibitory Neurons` = ggsci::pal_d3("category10")(10)[2],
                      Oligodendrocytes = ggsci::pal_d3("category10")(10)[3],
                      Astrocyte = ggsci::pal_d3("category10")(10)[4],
                      Microglia = ggsci::pal_d3("category10")(10)[5],
                      OPCs = ggsci::pal_d3("category10")(10)[6],
                      Endothelial = ggsci::pal_d3("category10")(10)[9],
                      Other = "gray")

cell_labels = list(ext = "Excitatory Neurons",
                   inh = "Inhibitory Neurons",
                   oli = "Oligodendrocytes",
                   ast = "Astrocyte",
                   mic = "Microglia",
                   opc = "OPCs",
                   end = "Endothelial",
                   other = "Other")

cell_perc_diag$cell.type_group = cell_perc_diag$cell.type
cell_perc_diag$cell.type_group[! cell_perc_diag$cell.type_group %in% as.character(names(cell_color_map))] <- "Other"

donor_order <- cell_perc_diag %>% filter(cell.type=="Excitatory Neurons") %>% select(projid, perc_cells) 
donor_order <- donor_order[order(donor_order$perc_cells, decreasing = T),]
donor_order$n_order = 1:nrow(donor_order)

cell_perc_diag <- cell_perc_diag %>% left_join(donor_order[,c("projid","n_order")])

ggplot(cell_perc_diag, aes(x = reorder(projid, n_order, sum), y=perc_cells, fill=cell.type_group)) +
  geom_bar(stat = "identity") +
  facet_wrap(~cogdx_group, scales = "free_x") +
  scale_fill_manual(values=cell_color_map) +
  theme_classic() + 
  easy_remove_x_axis() + 
  labs(x = "Donor", y = "% of cells", fill = "Cell type") 

Cells proportion

cell_prop_total = annotation.filt %>% group_by(cell.type) %>% summarise(n_cells = n()) %>% mutate(perc_cells = n_cells/sum(n_cells)) %>%
  arrange(-perc_cells) %>% select(cell.type, perc_cells) %>% mutate(perc_cells = scales::percent(perc_cells))

createDT(cell_prop_total)

Class proportion

class_prop_total = annotation.filt %>% group_by(class) %>% summarise(n_cells = n()) %>% mutate(perc_cells = n_cells/sum(n_cells)) %>%
  arrange(-perc_cells) %>% select(class, perc_cells) %>% mutate(perc_cells = scales::percent(perc_cells))

createDT(class_prop_total)

Demographics

Sex

counts_sex = as.data.frame( table(pheno.filt$msex))
colnames(counts_sex) = c("Sex", "Frequency")
counts_sex$Sex = as.character(counts_sex$Sex)
counts_sex$Sex[counts_sex$Sex == 1] = "Male"
counts_sex$Sex[counts_sex$Sex == 0] = "Female"

kable(counts_sex) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
Sex Frequency
Female 288
Male 136

Age range

Variable: age at death.

message(paste0("Age max: ", max(pheno.filt$age_death)))
## Age max: 108.282
message(paste0("Age min: ", min(pheno.filt$age_death)))
## Age min: 67.37
ageByDonor = unique(pheno.filt[,c("projid", "age_death", "msex")])
ageByDonor$msex = as.character(ageByDonor$msex)
ageByDonor$msex[ageByDonor$msex == 1] = "Male"
ageByDonor$msex[ageByDonor$msex == 0] = "Female"

mean_f = mean(ageByDonor[ageByDonor$msex == "Female", "age_death"], na.rm = T)
mean_m = mean(ageByDonor[ageByDonor$msex == "Male", "age_death"], na.rm = T)

message(paste0("Age mean female: ", mean_f))
## Age mean female: 90.1211701388889
message(paste0("Age mean male: ", mean_m))
## Age mean male: 87.2278014705882
ggplot(ageByDonor, aes(x=age_death, fill=msex)) +
  geom_histogram(bins = 25, colour='black', position = "stack") +
  labs(x="Age", y="Donors") +
  # scale_y_continuous(breaks = (1:20)) +
  # scale_x_continuous(breaks=seq(20,120,10)) + 
  geom_vline(xintercept=mean_f, color = "red", linetype="dashed") +
  geom_vline(xintercept=mean_m, color = "blue", linetype="dashed") +
  theme_classic()

Plots by diagnosis

cogdx = Clinical consensus diagnosis of cognitive status at time of death.

Cognitive decline

Variable: cogng_random_slope. Estimated slope from random effects model for global cognition.

# head(pheno.filt[,c("projid", "amyloid", "cogdx", "tangles", "cogng_random_slope")])
ggplot(data = pheno.filt, aes(x=projid, y = cogng_random_slope)) + 
  geom_point() +
  facet_wrap(~cogdx) +
  easy_remove_x_axis() +
  labs(x = "Individual", y = "Slope for global cognition")

Amyloid

Overall amyloid level - Mean of 8 brain regions.

ggplot(data = pheno.filt, aes(x=projid, y = amyloid)) + 
  geom_point() +
  facet_wrap(~cogdx) +
  easy_remove_x_axis() +
  labs(x = "Individual", y = "Amyloid level")

Tangles

Tangle density - Mean of 8 brain regions.

ggplot(data = pheno.filt, aes(x=projid, y = tangles)) + 
  geom_point() +
  facet_wrap(~cogdx) +
  scale_y_continuous(breaks = seq(0, 60, by=10)) +
  easy_remove_x_axis() +
  labs(x = "Individual", y = "Tangles density") 

Plots for cogdx_group

T test for pairwise comparisons.

Anova for global p-value.

pheno.filt$cogdx_3gp_desc = pheno.filt$cogdx_3gp
pheno.filt$cogdx_3gp_desc[pheno.filt$cogdx_3gp_desc == 1] <- "NCI"
pheno.filt$cogdx_3gp_desc[pheno.filt$cogdx_3gp_desc == 2] <- "MCI"
pheno.filt$cogdx_3gp_desc[pheno.filt$cogdx_3gp_desc == 3] <- "AD"
  
my_comparisons = list(c("NCI", "AD"), c("NCI", "MCI"), c("MCI", "AD"))

p1 <- ggplot(pheno.filt, aes(x = cogdx_3gp_desc, y = cogng_random_slope)) + 
  geom_boxplot(notch = F, na.rm = T) + 
  stat_compare_means(comparisons = my_comparisons, method = "t.test")+
  stat_compare_means(method = "anova", label.y = 0.4) + 
  geom_jitter() +
  labs(x = "Diagnosis of cognitive status", y = "Slope for global cognition") +
  theme_classic()
  
p2 <- ggplot(pheno.filt, aes(x = cogdx_3gp_desc, y = amyloid)) + 
  geom_boxplot(notch = F, na.rm = T) + 
  stat_compare_means(comparisons = my_comparisons, method = "t.test")+
  stat_compare_means(method = "anova", label.y = 30) + 
  geom_jitter() +
  labs(x = "Diagnosis of cognitive status", y = "Amyloid levels") +
  theme_classic()

p3 <- ggplot(pheno.filt, aes(x = cogdx_3gp_desc, y = tangles)) + 
  geom_boxplot(notch = F, na.rm = T) + 
  stat_compare_means(comparisons = my_comparisons, method = "t.test")+
  stat_compare_means(method = "anova", label.y = 90) + 
  geom_jitter() +
  labs(x = "Diagnosis of cognitive status", y = "Tangles density") +
  theme_classic()

grid.arrange(p1, p2, p3, ncol=3)

Session Info

sessionInfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: CentOS Stream 8
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/libopenblasp-r0.3.15.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] gridExtra_2.3    kableExtra_1.3.4 ggpubr_0.4.0     ggeasy_0.1.3    
##  [5] forcats_0.5.1    stringr_1.4.0    dplyr_1.0.9      purrr_0.3.4     
##  [9] readr_2.1.2      tidyr_1.2.0      tibble_3.1.7     ggplot2_3.3.6   
## [13] tidyverse_1.3.1 
## 
## loaded via a namespace (and not attached):
##  [1] httr_1.4.3        sass_0.4.1        jsonlite_1.8.0    viridisLite_0.4.0
##  [5] carData_3.0-5     modelr_0.1.8      bslib_0.3.1       assertthat_0.2.1 
##  [9] highr_0.9         cellranger_1.1.0  yaml_2.3.5        pillar_1.7.0     
## [13] backports_1.4.1   glue_1.6.2        digest_0.6.29     ggsignif_0.6.3   
## [17] rvest_1.0.2       colorspace_2.0-3  htmltools_0.5.2   pkgconfig_2.0.3  
## [21] broom_0.8.0       haven_2.5.0       scales_1.2.0      webshot_0.5.3    
## [25] svglite_2.1.0     tzdb_0.3.0        generics_0.1.2    farver_2.1.0     
## [29] car_3.0-13        ellipsis_0.3.2    DT_0.23           withr_2.5.0      
## [33] cli_3.3.0         magrittr_2.0.3    crayon_1.5.1      readxl_1.4.0     
## [37] evaluate_0.15     fs_1.5.2          fansi_1.0.3       rstatix_0.7.0    
## [41] xml2_1.3.3        tools_4.1.2       data.table_1.14.2 hms_1.1.1        
## [45] lifecycle_1.0.1   munsell_0.5.0     reprex_2.0.1      ggsci_2.9        
## [49] compiler_4.1.2    jquerylib_0.1.4   systemfonts_1.0.4 rlang_1.0.2      
## [53] grid_4.1.2        rstudioapi_0.13   htmlwidgets_1.5.4 crosstalk_1.2.0  
## [57] labeling_0.4.2    rmarkdown_2.14    gtable_0.3.0      abind_1.4-5      
## [61] DBI_1.1.2         R6_2.5.1          lubridate_1.8.0   knitr_1.39       
## [65] fastmap_1.1.0     utf8_1.2.2        stringi_1.7.6     vctrs_0.4.1      
## [69] dbplyr_2.1.1      tidyselect_1.1.2  xfun_0.31